In this document we analyze the PASS1B proteomics animal data of MoTrPAC. The flow steps are: (1) load the data from the google cloud bucket, (2) preprocess each dataset, (3) PCA analysis and correlation with metadata variables, and (4) flagging potential outliers.
# Specify the parameters required for the analysis
# This dir should have the motrpac-bic-norm-qc repo
repo_local_dir <- "~/Desktop/repos/"
# Runnable command for gsutil
gsutil_cmd <- "~/google-cloud-sdk/bin/gsutil"
# Where should the data be downloaded to
local_data_dir <- "~/Desktop/MoTrPAC/data/pass_1b/proteomics/"
# Bucket structure is: tissue/platform/results/<data files>. For GET, this path should
# also have a qa_qc directory with the qc_metrics and sample_metadata files.
bucket <- "gs://motrpac-release-data-staging/Results/pass1b-06/proteomics/"
# Specify bucket and local path for the phenotypic data
pheno_bucket <- "gs://motrpac-internal-release2-results/pass1b-06/phenotype/"
pheno_local_dir <- "~/Desktop/MoTrPAC/data/pass_1b/dmaqc_pheno/"
# Specify parameters for filtering by missing values:
# features whose NA frequency is >= max_na_freq are dropped
max_na_freq <- 0.66
# Should PCA use imputed data? (TRUE/FALSE rather than T/F: T and F are
# ordinary variables that can be reassigned, unlike the reserved words)
pca_using_impute <- FALSE
# Specify how many PCs to examine
num_pcs <- 5
num_pcs_for_outlier_analysis <- 3
# Specify the significance level for association analysis
# (e.g., between a PC and a clinical variable)
p_thr <- 0.0001
Specify the pipeline, metadata, and sample variables for the analysis.
# Technical pipeline covariates examined in the QC (TMT plex and NA counts).
pipeline_qc_cols <- c("tmt_plex", "num_NA")
# Biological/phenotypic covariates pulled from the DMAQC vial-label data.
biospec_cols <- c(
  "registration.sex", "key.anirandgroup",
  "registration.batchnumber",
  "training.staffid",
  "is_control",
  "vo2.max.test.vo2_max_visit1", # this assumes that visit1's are aligned
  "terminal.weight.mg", "time_to_freeze",
  "timepoint", "bid", "pid"
)
# Load the shared helper functions and libraries from the
# motrpac-bic-norm-qc repository (sourced in the same order as before;
# source() evaluates each script in the global environment).
invisible(lapply(
  paste0(repo_local_dir, "motrpac-bic-norm-qc/tools/",
         c("unsupervised_normalization_functions.R",
           "gcp_functions.R",
           "qc_report_aux_functions.R",
           "config_session.R",
           "association_analysis_methods.R")),
  source
))
Download the data
# Mirror the proteomics results bucket into local_data_dir, removing any
# stale local copies first. download_bucket_to_local_dir() comes from
# gcp_functions.R (sourced above) and shells out to gsutil.
obj = download_bucket_to_local_dir(bucket,local_path=local_data_dir,
remove_prev_files = T,GSUTIL_PATH=gsutil_cmd)
The code below takes the metadata and ratio tables. For raw intensities (rii) - specify the matrix regex in “parse_proteomics_datasets_from_download_obj”.
# Rebuild the download object from the files already on disk (same shape as
# the object returned by download_bucket_to_local_dir), so this chunk can be
# re-run without re-downloading.
obj = list(
downloaded_files = list.files(local_data_dir,full.names = T,recursive = T),
local_path = local_data_dir
)
# Parse the per-tissue, per-platform (prot-pr / prot-ph) datasets from the
# downloaded files. The result holds both the parsed datasets and a record
# of datasets that failed to parse (split apart right below).
proteomics_data = parse_proteomics_datasets_from_download_obj(
obj,local_path=local_data_dir,remove_prev_files = T,GSUTIL_PATH=gsutil_cmd
)
## [1] "t55-gastrocnemius,prot-ph"
## [1] "t55-gastrocnemius,prot-pr"
## [1] "t58-heart,prot-ph"
## [1] "t58-heart,prot-pr"
## [1] "t68-liver,prot-ph"
## [1] "t68-liver,prot-pr"
## [1] "t70-white-adipose,prot-ph"
## [1] "t70-white-adipose,prot-pr"
# Split the parse results: keep the failures aside and overwrite
# proteomics_data with only the successfully parsed datasets.
failed_datasets = proteomics_data$failed_datasets
proteomics_data = proteomics_data$proteomics_parsed_datasets
# this is commented out: the printed buckets here will refer to the "ac" datasets,
# and these were not submitted for this release.
# if(length(failed_datasets)>1){
# print("Reading the data failed form some buckets, details:")
# write.table(failed_datasets,row.names = F,quote=F,sep="\t",col.names = F)
# }
# Add the pheno data (DMAQC phenotype tables, keyed by vial label)
pheno_data = parse_pheno_data(pheno_bucket,local_path = pheno_local_dir,
remove_prev_files = T,GSUTIL_PATH=gsutil_cmd)
# add a tissue variable (for convenience)
pheno_data$viallabel_data$tissue =
pheno_data$viallabel_data$specimen.processing.sampletypedescription
# add the time-to-freeze variable (for convenience): interval between death
# and sample freezing, both recorded relative to the end of training
pheno_data$viallabel_data$time_to_freeze =
pheno_data$viallabel_data$calculated.variables.frozetime_after_train -
pheno_data$viallabel_data$calculated.variables.deathtime_after_train
# add a binary is_control variable: 1 when the randomization-group text
# mentions "control" (case-insensitive), 0 otherwise
pheno_data$viallabel_data$is_control = as.numeric(grepl("control",
pheno_data$viallabel_data$key.anirandgroup,ignore.case = T))
# add the timepoint as a number
# x - the text description of the group
get_numeric_timepoint<-function(x){
  # Map the textual randomization-group description to a numeric training
  # duration in weeks: "Eight-week" -> 8, "Four-week" -> 4 (case-insensitive);
  # anything else (e.g., controls) stays 0.
  weeks = numeric(length(x))
  durations = c("Eight"=8,"Four"=4)
  for(label in names(durations)){
    hits = grepl(paste0(label,"-week"),x,perl = T,ignore.case = T)
    weeks[hits] = durations[label]
  }
  return(weeks)
}
# Attach the numeric timepoint (weeks of training; 0 for controls) derived
# from the randomization-group description.
pheno_data$viallabel_data$timepoint = get_numeric_timepoint(
pheno_data$viallabel_data$key.anirandgroup
)
We use limma’s removeBatchEffect to account for plex variation. We also keep a version of the data with imputed missing values. The new data objects are saved into the proteomics_data list.
# Per-dataset preprocessing:
# (1) drop features with an NA frequency >= max_na_freq,
# (2) median/MAD normalize,
# (3) remove the TMT plex batch effect with limma, and
# (4) KNN-impute missing values (used later for PCA).
# Results are stored back into proteomics_data[[dataset]]$norm_filtered.
for(dataset_name in names(proteomics_data)){
print(paste("processing",dataset_name))
curr_data = proteomics_data[[dataset_name]]$sample_data
# align the sample metadata rows with the data matrix columns
sample_meta = proteomics_data[[dataset_name]]$sample_meta[colnames(curr_data),]
# Exclude features with many NAs (per-feature NA frequency)
row_NA_data = rowSums(is.na(curr_data))/ncol(curr_data)
feature_inds = row_NA_data < max_na_freq
# filter and normalize (run_median_mad_norm from the sourced tools)
norm_data = as.matrix(curr_data[feature_inds,])
norm_data = run_median_mad_norm(norm_data)
# b for batch corrected: regress out the TMT plex assignment
norm_data_b = limma::removeBatchEffect(norm_data,sample_meta$tmt_plex)
# Impute - use capture.output to avoid prints
# This will be used for PCA
capture.output({
norm_data_b_imp = impute::impute.knn(as.matrix(norm_data_b))$data
norm_data_imp = impute::impute.knn(as.matrix(norm_data))$data
},file=NULL)
# Median normalization (per MOP)
# norm_data_imp = run_median_mad_norm(norm_data_imp)
# impute.knn can leave NAs if too many values are missing - flag it
if(any(is.na(norm_data_imp))){
print(paste("Imputation failed in:",dataset_name))
}
# Store all four matrices plus the feature filter for downstream chunks:
# norm_data_b (batch corrected), norm_data_b_imp (corrected + imputed),
# norm_data (uncorrected), norm_data_imp (uncorrected + imputed).
proteomics_data[[dataset_name]][["norm_filtered"]] = list(
norm_data_b = norm_data_b,
norm_data_b_imp = norm_data_b_imp,
norm_data = norm_data,
norm_data_imp = norm_data_imp,
feature_inds = feature_inds
)
print(paste0('done, objects saved into proteomics_data[["',
dataset_name,'"]][[,norm_filtered]]'))
}
Examine the effect of the normalization process, which includes removing features with many NAs. For each dataset show the boxplots of three randomly selected plexes.
# Visual check of the normalization: for each dataset, boxplot the
# per-sample distributions of up to three randomly chosen TMT plexes,
# before (raw) and after filtering/normalization/batch correction
# (norm_filtered[[1]] == norm_data_b).
for(i in seq_along(proteomics_data)){
unnorm_data = proteomics_data[[i]]$sample_data
norm_data = proteomics_data[[i]][["norm_filtered"]][[1]]
# plex assignment of the normalized samples
plex = proteomics_data[[i]]$sample_meta[colnames(norm_data),"tmt_plex"]
plex = as.factor(plex)
# order the data by the plexes so same-plex boxes are adjacent
ord = order(plex)
unnorm_data = unnorm_data[,ord]
norm_data = norm_data[,ord]
plex = plex[ord]
# take a random subset of at most three plexes
# (head() avoids NA entries when a dataset has fewer than three plexes)
samp_plex = head(sample(unique(plex)),3)
samp = which(plex %in% samp_plex)
# plot; color each box by the plex of the plotted sample
# (was col=plex, which recycled colors from the start of the full vector
# and could mismatch the displayed columns)
curr_name = names(proteomics_data)[i]
colnames(unnorm_data) = NULL;colnames(norm_data) = NULL
boxplot(unnorm_data[,samp],
main=paste0(curr_name,"\nraw data (",nrow(unnorm_data)," features)"),
ylab = "log2 ratio",xaxt="n",col=plex[samp])
boxplot(norm_data[,samp],
main=paste0(curr_name,"\nnorm data (ratios) (",nrow(norm_data)," features)"),
ylab = "log2 ratio",xaxt="n",col=plex[samp])
# # Check with and without batch effects analysis (kept for reference)
# par(mfrow=c(1,3))
# norm_data = proteomics_data[[i]][["norm_filtered"]][[2]]
# boxplot(norm_data[,samp],
# main=paste0(curr_name,"\nnorm data (",nrow(norm_data)," features)"),
# ylab = "log2 ratio",xaxt="n",col=plex)
# norm_data2 = limma::removeBatchEffect(norm_data,plex)
# es = ExpressionSet(norm_data)
# es$plex = plex
# norm_data3 = remove_batch_effect(es,"plex")
# norm_data3 = exprs(norm_data3)
# boxplot(norm_data2[,samp],
# main=paste0(curr_name,"\nnorm data + limma (",nrow(norm_data)," features)"),
# ylab = "log2 ratio",xaxt="n",col=plex)
# boxplot(norm_data3[,samp],
# main=paste0(curr_name,"\nnorm data + vp.misc (",nrow(norm_data)," features)"),
# ylab = "log2 ratio",xaxt="n",col=plex)
}
Run PCA analysis on the imputed data, with and without mitigating batch effects:
# PCA per dataset, with and without the plex batch-effect correction.
# Scores of the top num_pcs PCs (plus sex, plex, and randomization group)
# are stored under "pca_b" (batch corrected) and "pca_w_b" (uncorrected).
for(dataset_name in names(proteomics_data)){
data_obj = proteomics_data[[dataset_name]]
data_samples = colnames(data_obj[["norm_filtered"]][["norm_data_b"]])
randgroup = pheno_data$viallabel_data[data_samples,"key.anirandgroup"]
sex = pheno_data$viallabel_data[data_samples,"registration.sex"]
# DMAQC codes sex as 1/2; recode to F/M for readable plot legends
sex[sex=="1"] = "F";sex[sex=="2"]="M"
plex = data_obj$sample_meta[data_samples,"tmt_plex"]
# pca after removing batch effects; when not using the imputed matrix,
# restrict to complete (NA-free) features since prcomp cannot handle NAs
if(pca_using_impute){
curr_data = data_obj[["norm_filtered"]][["norm_data_b_imp"]]
}
else{
curr_data = data_obj[["norm_filtered"]][["norm_data_b"]]
curr_data = curr_data[rowSums(is.na(curr_data))==0,]
}
curr_pca = prcomp(t(curr_data),scale. = TRUE,center = TRUE)
curr_pcax = curr_pca$x[,1:num_pcs]
# cumulative proportion of variance explained by the top num_pcs PCs;
# row 3 of the importance matrix is "Cumulative Proportion".
# (was hard-coded to column 5 - keep it in sync with num_pcs)
explained_var = summary(curr_pca)[["importance"]][3,num_pcs]
df = data.frame(curr_pcax,sex=sex,plex=plex,randgroup = randgroup)
proteomics_data[[dataset_name]][["norm_filtered"]][["pca_b"]] = list(
pcax = df,explained_var=explained_var
)
# pca without removing batch effects
if(pca_using_impute){
curr_data = data_obj[["norm_filtered"]][["norm_data_imp"]]
}
else{
curr_data = data_obj[["norm_filtered"]][["norm_data"]]
curr_data = curr_data[rowSums(is.na(curr_data))==0,]
}
curr_pca = prcomp(t(curr_data),scale. = TRUE,center = TRUE)
curr_pcax = curr_pca$x[,1:num_pcs]
explained_var = summary(curr_pca)[["importance"]][3,num_pcs]
df = data.frame(curr_pcax,sex=sex,plex=plex,randgroup = randgroup)
proteomics_data[[dataset_name]][["norm_filtered"]][["pca_w_b"]] = list(
pcax = df,explained_var=explained_var
)
}
PCA plots:
# PC1 vs PC2 scatter plots for every dataset, before (pca_w_b) and after
# (pca_b) the plex batch-effect correction. Color = plex, shape = sex.
for(dataset_name in names(proteomics_data)){
  slot_titles = c(pca_w_b = "before batch effects analysis",
                  pca_b = "after batch effects analysis")
  for(slot in names(slot_titles)){
    scores = proteomics_data[[dataset_name]][["norm_filtered"]][[slot]]$pcax
    p = ggplot(scores) +
      geom_point(aes(x=PC1, y=PC2,col=plex, shape=sex)) +
      ggtitle(paste(dataset_name,slot_titles[[slot]],sep=": "))
    plot(p)
  }
}
We plot the top two PCs for each tissue, color and shape correspond to randomization group and sex.
# Batch-corrected PC1 vs PC2 per dataset, colored by randomization group
# with shape denoting sex.
for(ds in names(proteomics_data)){
  scores = proteomics_data[[ds]][["norm_filtered"]][["pca_b"]]$pcax
  fig = ggplot(scores) +
    geom_point(aes(x=PC1, y=PC2,col=randgroup, shape=sex)) +
    ggtitle(ds)
  plot(fig)
}
Here we analyze the top principal components (5) in each tissue and compute their association with the selected variables above (e.g., the pipeline qc metrics). We use Spearman correlation and a linear test for significance. For each dataset we also add the correlations among the metadata variables. These two analyses should be interpreted together, as the analyzed variables are not independent.
# Association analysis: Spearman correlations (and p-values via
# pairwise_eval/pairwise_association_wrapper from the sourced tools)
# between (a) the top PCs and the QC/phenotype variables, and
# (b) the metadata variables themselves. Significant hits (p <= p_thr)
# accumulate into the two report matrices.
pcs_vs_qc_var_report = c()
metadata_variable_assoc_report = c()
for(dataset_name in names(proteomics_data)){
# Get the projected PCs (batch-corrected PCA); keep only the PC columns
curr_pcax = proteomics_data[[dataset_name]][["norm_filtered"]][["pca_b"]][[1]]
curr_pcax = curr_pcax[,grepl("^PC",names(curr_pcax))]
curr_pipeline_meta = proteomics_data[[dataset_name]]$sample_meta
curr_data_samples = rownames(curr_pcax)
curr_meta = pheno_data$viallabel_data[curr_data_samples,biospec_cols]
# remove metadata variables with NAs
curr_meta = curr_meta[,!apply(is.na(curr_meta),2,any)]
# remove bid and pid (identifiers, not covariates)
curr_meta = curr_meta[,!grepl("pid|bid",names(curr_meta))]
# Add batch (plex) as a numeric covariate
curr_meta$plex = as.numeric(
as.factor(curr_pipeline_meta[curr_data_samples,"tmt_plex"]))
curr_meta_numeric_cols = sapply(curr_meta,is.numeric)
# Remove zero sd columns (constant variables break correlation)
curr_meta_numeric_cols[curr_meta_numeric_cols] =
apply(curr_meta[,curr_meta_numeric_cols],2,sd)>0
corrs = cor(curr_pcax,curr_meta[,curr_meta_numeric_cols],method="spearman")
corrsp = pairwise_eval(
curr_pcax,curr_meta[,curr_meta_numeric_cols],func=pairwise_association_wrapper,
f=1,max_num_vals_for_discrete=8)
# Some ggplots have to be printed to be shown in the notebook
print(ggcorrplot(t(corrs),lab=T,lab_size=2.5,hc.order = F) +
ggtitle(dataset_name) +
theme(plot.title = element_text(hjust = 0.5,size=20)))
# Record every PC-vs-variable pair that passes the significance threshold
for(i in 1:nrow(corrsp)){
for(j in 1:ncol(corrsp)){
if(corrsp[i,j]>p_thr){next}
pcs_vs_qc_var_report = rbind(pcs_vs_qc_var_report,
c(dataset_name,
rownames(corrsp)[i],colnames(corrsp)[j],corrs[i,j],corrsp[i,j])
)
}
}
# (re)assign column names; harmless to repeat every iteration
colnames(pcs_vs_qc_var_report) = c("Dataset(tissue,site)","PC",
"qc_metric","rho(spearman)","p-value")
####
# compute correlations among the metadata variables
####
corrs = cor(curr_meta[,curr_meta_numeric_cols],method="spearman")
corrsp = pairwise_eval(
curr_meta[,curr_meta_numeric_cols],func=pairwise_association_wrapper,
f=1,max_num_vals_for_discrete=8)
# Some ggplots have to be printed to be shown in the notebook
print(ggcorrplot(corrs,lab=T,lab_size=2.5,hc.order = T))
# Iterate the lower triangle only: the inner loop breaks at the diagonal
for(n1 in rownames(corrsp)){
for(n2 in rownames(corrsp)){
if(n1==n2){break}
# skip pairs where both are biological variables - those associations
# are expected and not a QC concern
if(n1 %in% biospec_cols &&
n2 %in% biospec_cols) {next}
if(corrsp[n1,n2]>p_thr){next}
metadata_variable_assoc_report = rbind(metadata_variable_assoc_report,
c(dataset_name,n1,n2,corrs[n1,n2],corrsp[n1,n2])
)
}
}
if(!is.null(dim(metadata_variable_assoc_report))){
colnames(metadata_variable_assoc_report) = c(
"Dataset(tissue,site)","Var1","Var2","rho(spearman)","p-value")
}
}
# Format the reports - for a nicer presentation in a table.
# Round the rho (col 4) and p-value (col 5) columns to three significant
# digits. Guard with dim(): the reports stay NULL when no association
# passed p_thr, and format()/indexing on NULL would error.
if(!is.null(dim(pcs_vs_qc_var_report))){
pcs_vs_qc_var_report[,5] = format(
as.numeric(pcs_vs_qc_var_report[,5]),digits=3)
pcs_vs_qc_var_report[,4] = format(
as.numeric(pcs_vs_qc_var_report[,4]),digits=3)
}
if(!is.null(dim(metadata_variable_assoc_report))){
metadata_variable_assoc_report[,5] = format(
as.numeric(metadata_variable_assoc_report[,5]),digits=3)
metadata_variable_assoc_report[,4] = format(
as.numeric(metadata_variable_assoc_report[,4]),digits=3)
}
kable(pcs_vs_qc_var_report,longtable=T,
caption = "Correlations between PCs and other variables") %>%
kable_styling(latex_options = c("repeat_header"),font_size = 8)
| Dataset(tissue,site) | PC | qc_metric | rho(spearman) | p-value |
|---|---|---|---|---|
| t55-gastrocnemius,prot-ph | PC3 | registration.sex | -0.647 | 1.46e-07 |
| t55-gastrocnemius,prot-ph | PC3 | terminal.weight.mg | -0.542 | 3.36e-06 |
| t55-gastrocnemius,prot-pr | PC1 | registration.sex | -0.614 | 3.68e-07 |
| t55-gastrocnemius,prot-pr | PC1 | training.staffid | 0.601 | 3.35e-07 |
| t55-gastrocnemius,prot-pr | PC1 | terminal.weight.mg | -0.468 | 3.88e-06 |
| t55-gastrocnemius,prot-pr | PC5 | vo2.max.test.vo2_max_visit1 | 0.371 | 8.52e-05 |
| t58-heart,prot-ph | PC1 | registration.sex | 0.853 | 3.92e-15 |
| t58-heart,prot-ph | PC1 | training.staffid | -0.609 | 2.06e-08 |
| t58-heart,prot-ph | PC1 | terminal.weight.mg | 0.747 | 1.17e-13 |
| t58-heart,prot-pr | PC1 | registration.sex | -0.866 | 9.31e-29 |
| t58-heart,prot-pr | PC1 | training.staffid | 0.520 | 6.27e-06 |
| t58-heart,prot-pr | PC1 | vo2.max.test.vo2_max_visit1 | 0.560 | 2.84e-06 |
| t58-heart,prot-pr | PC1 | terminal.weight.mg | -0.737 | 6.09e-22 |
| t68-liver,prot-ph | PC1 | registration.sex | 0.853 | 3.92e-15 |
| t68-liver,prot-ph | PC1 | training.staffid | -0.609 | 2.06e-08 |
| t68-liver,prot-ph | PC1 | terminal.weight.mg | 0.747 | 1.17e-13 |
| t68-liver,prot-pr | PC1 | registration.sex | -0.864 | 1.59e-34 |
| t68-liver,prot-pr | PC1 | vo2.max.test.vo2_max_visit1 | 0.588 | 1.56e-07 |
| t68-liver,prot-pr | PC1 | terminal.weight.mg | -0.734 | 8.37e-24 |
| t70-white-adipose,prot-ph | PC1 | registration.sex | -0.866 | 3.55e-34 |
| t70-white-adipose,prot-ph | PC1 | training.staffid | 0.472 | 6.83e-05 |
| t70-white-adipose,prot-ph | PC1 | vo2.max.test.vo2_max_visit1 | 0.601 | 1.65e-07 |
| t70-white-adipose,prot-ph | PC1 | terminal.weight.mg | -0.748 | 1.09e-24 |
| t70-white-adipose,prot-ph | PC5 | timepoint | 0.415 | 9.92e-05 |
| t70-white-adipose,prot-pr | PC1 | registration.sex | -0.866 | 5.12e-34 |
| t70-white-adipose,prot-pr | PC1 | training.staffid | 0.539 | 3.47e-05 |
| t70-white-adipose,prot-pr | PC1 | vo2.max.test.vo2_max_visit1 | 0.624 | 8.09e-08 |
| t70-white-adipose,prot-pr | PC1 | terminal.weight.mg | -0.771 | 1.78e-24 |
| t70-white-adipose,prot-pr | PC5 | is_control | 0.587 | 4.09e-07 |
# Render the metadata-vs-metadata association table
kable(metadata_variable_assoc_report,longtable=T,
caption = "Correlations between metadata variables")%>%
kable_styling(latex_options = c("repeat_header"),font_size = 8)
In this analysis, outliers are flagged by examining the boxplot of each PC, extending its whiskers to three times the inter-quartile range away from the boxplot. Samples outside this range are then flagged.
Note that samples are flagged using an automatic analysis of the principal components. Such analyses may flag samples because of true biological effects and thus further examination is required before determining if flagged samples represent low quality samples. Moreover, note that we plot outliers for the rii (raw intensities) data as well. Typically, outliers from such datasets will not appear in the better-normalized ratio data.
# Univariate PC outlier flagging: a sample is flagged on PCj when its score
# falls outside the boxplot whiskers extended to 3x the IQR (coef = 3).
pca_outliers_report = c()
for(dataset_name in names(proteomics_data)){
# Get the projected PCs of the batch-corrected PCA.
# Bug fix: the PCA results are stored under "pca_b" (see the PCA chunk);
# the previous lookup of a non-existent "pca" element returned NULL.
curr_pcax = proteomics_data[[dataset_name]][["norm_filtered"]][["pca_b"]][[1]]
# Univariate: use IQRs
pca_outliers = c()
for(j in 1:num_pcs_for_outlier_analysis){
# Bug fix: data.frame columns carry no names, so boxplot.stats()$out was
# unnamed and the reporting loop below never executed. Name the scores by
# their sample (row) names first.
pc_scores = curr_pcax[,j]
names(pc_scores) = rownames(curr_pcax)
outlier_values <- boxplot.stats(pc_scores,coef = 3)$out
for(outlier in names(outlier_values)){
pca_outliers_report = rbind(pca_outliers_report,
c(dataset_name,paste("PC",j,sep=""),outlier,
format(outlier_values[outlier],digits=5))
)
# keep each flagged sample once for the plots below
if(!is.element(outlier,names(pca_outliers))){
pca_outliers[outlier] = outlier_values[outlier]
}
}
}
# Plot the flagged samples (green, circle pch) on PC1/PC2 and PC3/PC4
if(length(pca_outliers)>0){
df = data.frame(curr_pcax,
outliers = rownames(curr_pcax) %in% names(pca_outliers))
col = rep("black",nrow(df))
col[df$outliers] = "green"
plot(df$PC1,df$PC2,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
main = paste(dataset_name,"flagged outliers"),
xlab = "PC1",ylab="PC2")
plot(df$PC3,df$PC4,pch = as.numeric(df$outliers),col=col,lwd=2,cex=1,
main = paste(dataset_name,"flagged outliers"),
xlab = "PC3",ylab="PC4")
}
}
if(!is.null(dim(pca_outliers_report))){
colnames(pca_outliers_report) = c("dataset","PC","sample","score")
}
Here we map features to their chromosomes and then use the X and Y chromosomes to train a classifier for predicting sex. Note that unlike the sequencing platforms we use the molecular features from the data, which requires running a regularized learning algorithm. We use linear SVM as it seems to produce reasonable results.
# Sex check: predict registration.sex from X/Y-chromosome features.
# Gene symbols are mapped to Entrez ids and then to chromosomes via the
# org.Rn.eg (rat) annotation maps; a linear SVM with leave-one-out CV
# (caret) is trained per dataset, and misclassified samples are flagged.
entrez2chr = as.list(org.Rn.egCHR)
symbol2entrez = as.list(org.Rn.egSYMBOL2EG)
sex_pred_err_rates = c()
sex_check_outliers = c()
for(dataset_name in names(proteomics_data)){
# norm_filtered[[2]] is the batch-corrected, imputed matrix
curr_data = proteomics_data[[dataset_name]][["norm_filtered"]][[2]]
curr_annot = proteomics_data[[dataset_name]]$row_annot
# get the correct row annotation (was filtered by NAs above)
feature_inds = proteomics_data[[dataset_name]][["norm_filtered"]]$feature_inds
curr_annot = curr_annot[feature_inds,]
# map to entrez ids
curr_entrez = symbol2entrez[curr_annot[,"gene_symbol"]]
# take rows with a unique entrez id (drop ambiguous/missing mappings)
single_entrez = sapply(curr_entrez,length)==1
curr_data = curr_data[single_entrez,]
curr_annot = curr_annot[single_entrez,]
curr_entrez = unlist(curr_entrez[single_entrez])
# map to chromosomes
curr_chrs = entrez2chr[curr_entrez]
# limit the data to X and Y chromosomes
# (despite the "_y_" name this keeps both X and Y genes)
chrr_y_genes = sapply(curr_chrs,function(x)any(x=="Y" | x=="X"))
chrr_y_genes[is.na(chrr_y_genes)] = F
# need at least two sex-chromosome features to train a classifier
if(sum(chrr_y_genes,na.rm=T)<2){next}
sex = pheno_data$viallabel_data[colnames(curr_data),"registration.sex"]
# samples as rows, sex-chromosome features as columns
df = data.frame(sex=as.factor(sex),t(curr_data[chrr_y_genes,]))
df = df[,colSums(is.na(df))==0]
# run an svm classifier; number = nrow(df) makes this leave-one-out CV
train_control <- caret::trainControl(method = "cv", number = nrow(df),
savePredictions = TRUE)
# train the model on training set
model <- caret::train(sex ~ .,data = df,
trControl = train_control,method = "svmLinear2")
# loocv results; keep predictions at cost 0.5 only
# NOTE(review): assumes 0.5 is in svmLinear2's tuning grid - verify if
# the caret version or tuneGrid changes
cv_res = model$pred
cv_res = cv_res[cv_res[,"cost"]==0.5,]
# column 1 vs column 2: predicted vs observed class
cv_errors = cv_res[,1] != cv_res[,2]
acc = 1 - (sum(cv_errors)/nrow(cv_res))
print(paste(
"In dataset",dataset_name,
"sex prediction using X,Y chromosomes, cross-validation accuracy is:",
round(acc,digits = 3)))
sex_pred_err_rates = rbind(sex_pred_err_rates,
c(dataset_name,mean(cv_errors))
)
# flag every misclassified sample
err_samples = rownames(df)[cv_errors]
for(samp in err_samples){
sex_check_outliers = rbind(
sex_check_outliers,c(dataset_name,samp))
}
}
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t55-gastrocnemius,prot-ph sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.967"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t55-gastrocnemius,prot-pr sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.983"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t58-heart,prot-ph sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t58-heart,prot-pr sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t68-liver,prot-ph sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t68-liver,prot-pr sex prediction using X,Y chromosomes, cross-validation accuracy is: 0.983"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t70-white-adipose,prot-ph sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
## [1] "In dataset t70-white-adipose,prot-pr sex prediction using X,Y chromosomes, cross-validation accuracy is: 1"
# Name the columns of the sex-mismatch table (skipped when empty/NULL).
if(!is.null(dim(sex_check_outliers))){
  colnames(sex_check_outliers) = c("dataset","sample")
}
# Build a lookup from each vial label to the dataset it belongs to.
sample2dataset = c()
for(ds in names(proteomics_data)){
  vials = colnames(proteomics_data[[ds]]$sample_data)
  sample2dataset[vials] = ds
}
# add the dataset name to mop outliers:
# union of all flagged samples - PC-based outliers and sex-check errors.
all_flagged = NULL
if(length(pca_outliers_report)>0){
all_flagged = union(all_flagged,pca_outliers_report[,"sample"])
}
if(length(sex_check_outliers)>0){
all_flagged = union(all_flagged,sex_check_outliers[,2])
}
# One row per flagged sample: vial label, dataset, and the methods
# (specific PCs and/or "Sex-flagged") that flagged it.
flagged_sample_report = c()
for(samp in all_flagged){
samp_dataset = sample2dataset[samp]
samp_metric = NULL
if(!is.null(dim(sex_check_outliers)) &&
samp %in% sex_check_outliers[,"sample"]){
samp_metric = c(samp_metric,"Sex-flagged")
}
if(!is.null(dim(pca_outliers_report)) &&
samp %in% pca_outliers_report[,"sample"]){
curr_pcs = pca_outliers_report[pca_outliers_report[,"sample"]==samp,"PC"]
samp_metric = c(samp_metric,curr_pcs)
}
flagged_sample_report = rbind(flagged_sample_report,
c(samp,samp_dataset,paste(samp_metric,collapse=","))
)
}
# Guard: colnames()/kable() on NULL would error when nothing was flagged.
if(!is.null(dim(flagged_sample_report))){
colnames(flagged_sample_report) = c("Viallabel","Dataset","Methods")
kable(flagged_sample_report,longtable=T,
caption = "Outliers detected by tissue PCA data")%>%
kable_styling(font_size = 8,latex_options = c("hold_position", "repeat_header"))
}
| Viallabel | Dataset | Methods |
|---|---|---|
| 90223015507 | t55-gastrocnemius,prot-pr | Sex-flagged |
| 90252015507 | t55-gastrocnemius,prot-pr | Sex-flagged |
| 90560016806 | t68-liver,prot-pr | Sex-flagged |
# Write the normalized (and imputed) matrices plus the feature annotation
# to disk, then copy each file to the release bucket with gsutil.
# use the raw bucket names as the output location
# (implied assumption: rna_seq_data and norm_rnaseq_data have the same order)
# When the release data become ready - out_bucket
# should be the same as the input (out_bucket == bucket)
out_bucket = bucket
for(dataset_name in names(proteomics_data)){
# [[1]] = norm_data_b (batch corrected), [[2]] = norm_data_b_imp (imputed)
x = proteomics_data[[dataset_name]][["norm_filtered"]][[1]]
x_imp = proteomics_data[[dataset_name]][["norm_filtered"]][[2]]
x_annot = proteomics_data[[dataset_name]]$row_annot
# get the correct row annotation (was filtered by NAs above)
feature_inds = proteomics_data[[dataset_name]][["norm_filtered"]]$feature_inds
x_annot = x_annot[feature_inds,]
# dataset name "tissue,platform" becomes the bucket subpath tissue/platform/
out_b = paste0(out_bucket,gsub(",","/",dataset_name),"/")
# simple numeric row ids shared by the data matrix and the annotation file
rownames(x) = seq_len(nrow(x))
rownames(x_annot) = rownames(x)
# add bid and pid header rows (stacked above the data, under viallabel)
x_bids = pheno_data$viallabel_data[colnames(x),"bid"]
x_pids = pheno_data$viallabel_data[colnames(x),"pid"]
newx = x
for(j in seq_len(ncol(newx))){newx[,j] = round(as.numeric(newx[,j]),digits=5)}
newx = rbind(x_bids,newx)
newx = rbind(x_pids,newx)
newx = rbind(colnames(x),newx)
colnames(newx) = NULL
newx_imp = x_imp
for(j in seq_len(ncol(newx_imp))){newx_imp[,j] = round(as.numeric(newx_imp[,j]),digits=5)}
newx_imp = rbind(x_bids,newx_imp)
newx_imp = rbind(x_pids,newx_imp)
newx_imp = rbind(colnames(x_imp),newx_imp)
colnames(newx_imp) = NULL
# sanity check: after dropping the 3 header rows, the rounded values must
# match the source matrix to within the rounding tolerance
m = newx[-c(1:3),]
mode(m) = "numeric"
if(!all(abs(m-x)< 1e-05,na.rm=T)){
print("Error in parsing the matrix, breaking")
break
}
# same sanity check for the imputed matrix
# (bug fix: compare against x_imp - previously compared against x)
m = newx_imp[-c(1:3),]
mode(m) = "numeric"
if(!all(abs(m-x_imp)< 1e-05,na.rm=T)){
print("Error in parsing the matrix, breaking")
break
}
rownames(newx)[1:3] = c("viallabel","pid","bid")
rownames(newx_imp)[1:3] = c("viallabel","pid","bid")
# save the output to the target bucket; file names follow the
# motrpac_pass1b-06_<tissue>_<ome>_<content>.txt convention
arr = strsplit(dataset_name,split=",")[[1]]
curr_ome = tolower(arr[2])
curr_t = tolower(arr[1])
local_fname = paste0(local_data_dir,
"motrpac_pass1b-06_",curr_t,"_",curr_ome,
"_med-mad-normalized-logratio.txt")
write.table(newx,file=local_fname,sep="\t",quote=F,
row.names = T,col.names = F)
cmd = paste(gsutil_cmd,"cp",local_fname,out_b)
system(cmd)
local_fname_imp = paste0(local_data_dir,
"motrpac_pass1b-06_",curr_t,"_",curr_ome,
"_med-mad-normalized-imputed-logratio.txt")
write.table(newx_imp,file=local_fname_imp,sep="\t",quote=F,
row.names = T,col.names = F)
cmd = paste(gsutil_cmd,"cp",local_fname_imp,out_b)
system(cmd)
local_fname2 = paste0(local_data_dir,
"motrpac_pass1b-06_",curr_t,"_",curr_ome,
"_normalized-data-feature-annot.txt")
write.table(x_annot,file=local_fname2,sep="\t",quote=F,
row.names = T,col.names = T)
cmd = paste(gsutil_cmd,"cp",local_fname2,out_b)
system(cmd)
}